
rm(list=ls())
## load tsv data table

library(xlsx)
data_uri = read.xlsx("Eisner/muscleloss2010.xlsx",1,stringsAsFactors=FALSE)
sample_info =  read.xlsx("Eisner/urinarymetaboliteCD.xlsx",1,stringsAsFactors=FALSE)

identical(sample_info$Patient.ID, colnames(data_uri)[-c(1,2)])

# ## 1.  remove metabolites with unknown chebi ID
# data_chebi = data[sapply(data[,1], function(x) grepl("CHEBI", x) ),]
# 
# ## 2.  remove columns with NAs
# data_nonna = data_chebi[,!apply(is.na(data_chebi), 2, any)]
# 
# ## 3. remove metabolites with >=50% zeros
# data_metab = apply(data_nonna[,8:19], c(1,2), as.numeric)
# metab_delete = rowMeans(data_metab==0) >= 0.5
# data_uri = data_nonna[!metab_delete,]


## 4. save the clean data
# library(xlsx)
write.xlsx(data_uri,"Eisner/meta_uri.xlsx")

## save chebi IDs for generating pathways
chebi_uri = data_uri$chebi_ID
write.table(chebi_uri, file = "Eisner/chebi_uri.txt",row.names = F,col.names = F, quote = FALSE)

## 5. save the data matrix
data_matrix = apply(t(data_uri[,-c(1,2)]), c(1,2), as.numeric)
rownames(data_matrix)
colnames(data_matrix) = data_uri$meta_ID

## 6. normalized the data
hist(data_matrix)
if(min(data_matrix)>0){
  data_matrix_normal = log2(data_matrix)
  hist(data_matrix_normal)
} else{
  a = 1 ## default value, could be other values 
  data_matrix_normal = log2((data_matrix + sqrt(data_matrix^2 + a^2))/2)
  hist(data_matrix_normal)
}

y = as.numeric(grepl("cachexic", sample_info$Muscle.loss ))
d_uri = cbind(data_matrix_normal, y) 
save(d_uri, file="Eisner/d_uri.RData")



############################   ========================= smpdb_HMDB_pathway =========================   ############################ 

smpdbhmdb <- data.frame(fread("Eisner/pathwaydatabase_uri/mbrole2_smpdbhmdb.tsv"))
smpdbhmdbpath_chebi = smpdbhmdb$Submitted.IDs
names(smpdbhmdbpath_chebi) = smpdbhmdb$ID.Annotation
listsmpdbhmdbpath = lapply(smpdbhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listsmpdbhmdbpath_uniq = listsmpdbhmdbpath[which(!duplicated(listsmpdbhmdbpath))]

## map cheiID to metabolites identifies
smpdbhmdbpath <- lapply(listsmpdbhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_uri$meta_ID[which(chebi_uri %in% x )] )

smpdbhmdbpath_metab = smpdbhmdbpath[which(!duplicated(smpdbhmdbpath))]

save(smpdbhmdbpath_metab, file="Eisner/pathwaydatabase_uri/smpdbhmdbpath_metab_uri.RData")




############################   ========================= Biocycpathway =========================   ############################ 

biocyc <- data.frame(fread("Eisner/pathwaydatabase_uri/mbrole2_biocyc.tsv"))
biocycpath_chebi = biocyc$Submitted.IDs
names(biocycpath_chebi) = biocyc$ID.Annotation
listbiocycpath = lapply(biocycpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listbiocycpath_uniq = listbiocycpath[which(!duplicated(listbiocycpath))]

## map cheiID to metabolites identifies
biocycpath <- lapply(listbiocycpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_uri$meta_ID[which(chebi_uri %in% x )] )

biocycpath_metab = biocycpath[which(!duplicated(biocycpath))]

save(biocycpath_metab, file="Eisner/pathwaydatabase_uri/biocycpath_metab_uri.RData")



############################   ========================= keggpathway =========================   ############################ 

kegg<- data.frame(fread("Eisner/pathwaydatabase_uri/mbrole2_kegg.tsv"))
keggpath_chebi = kegg$Submitted.IDs
names(keggpath_chebi) = kegg$ID.Annotation
listkeggpath = lapply(keggpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listkeggpath_uniq = listkeggpath[which(!duplicated(listkeggpath))]

## map cheiID to metabolites identifies
keggpath <- lapply(listkeggpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_uri$meta_ID[which(chebi_uri %in% x )] )

keggpath_metab = keggpath[which(!duplicated(keggpath))]

save(keggpath_metab, file="Eisner/pathwaydatabase_uri/keggpath_metab_uri.RData")






############################   ========================= wikipathway =========================   ############################ 
library(rWikiPathways)
listMetab <- NULL
Pathways<-listPathwayIds(organism="Homo sapiens")
wikipath_info<-data.frame(id=1:length(Pathways))
wikipath_info$names<-Pathways
#list of chebi IDs per pathway
listMetab<-apply(as.matrix(Pathways),1, function(x) getXrefList(pathway = x, systemCode="Ce"))

#remove chebiID character
listMetab<-lapply(listMetab, function(x) gsub('^\\CHEBI:|\\$', '', x))
listMetab<-lapply(listMetab, function(x) unique(x))
wikipath_info$chebi_path = listMetab
wikipath_info$path_length<-sapply(listMetab, function(x) length(x))


#map chebi IDs to  your own metabolite names
wikiMetab<-lapply(listMetab, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_uri$meta_ID[which(sapply(chebi_uri, function(x) gsub('^\\CHEBI:|\\$', '', x)) %in% x)])  

# remove empty pathways
wikipath = wikiMetab[sapply(wikiMetab,length)>0]
wikipath_uniq = wikipath[which(!duplicated(wikipath))]
save(wikipath_uniq, file="Eisner/wikipath_metab_uri.RData")




# ############################   ========================= smpdb_ECMDB_pathway =========================   ############################ 
# 
# smpdbecmdb <- data.frame(fread("meta_uri/mbrole2_smpdbecmdb.tsv"))
# smpdbecmdbpath_chebi = smpdbecmdb$Submitted.IDs
# names(smpdbecmdbpath_chebi) = smpdbecmdb$ID.Annotation
# listsmpdbecmdbpath = lapply(smpdbecmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbecmdbpath_uniq = listsmpdbecmdbpath[which(!duplicated(listsmpdbecmdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbecmdbpath <- lapply(listsmpdbecmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_uri$meta_ID[which(chebi_uri %in% x )] )
# 
# smpdbecmdbpath_metab = smpdbecmdbpath[which(!duplicated(smpdbecmdbpath))]
# 
# save(smpdbecmdbpath_metab, file="meta_uri/smpdbecmdbpath_metab_uri.RData")
# 
# 
# 
# 
# ############################   ========================= smpdb_YMDB_pathway =========================   ############################ 
# 
# smpdbymdb <- data.frame(fread("meta_uri/mbrole2_smpdbymdb.tsv"))
# smpdbymdbpath_chebi = smpdbymdb$Submitted.IDs
# names(smpdbymdbpath_chebi) = smpdbymdb$ID.Annotation
# listsmpdbymdbpath = lapply(smpdbymdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbymdbpath_uniq = listsmpdbymdbpath[which(!duplicated(listsmpdbymdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbymdbpath <- lapply(listsmpdbymdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_uri$meta_ID[which(chebi_uri %in% x )] )
# 
# smpdbymdbpath_metab = smpdbymdbpath[which(!duplicated(smpdbymdbpath))]
# 
# save(smpdbymdbpath_metab, file="meta_uri/smpdbymdbpath_metab_uri.RData")


############################   ========================= chebirole =========================   ############################ 

# chebirole<- data.frame(fread("Eisner/pathwaydatabase_uri//mbrole2_chebirole.tsv"))
# chebirolepath_chebi = chebirole$Submitted.IDs
# names(chebirolepath_chebi) = chebirole$ID.Annotation
# listchebirolepath = lapply(chebirolepath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listchebirolepath_uniq = listchebirolepath[which(!duplicated(listchebirolepath))]
# 
# ## map cheiID to metabolites identifies
# chebirolepath <- lapply(listchebirolepath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_uri$meta_ID[which(chebi_uri %in% x )] )
# 
# chebirolepath_metab = chebirolepath[which(!duplicated(chebirolepath))]
# 
# save(chebirolepath_metab, file="Eisner/pathwaydatabase_uri//chebirolepath_metab_uri.RData")



############################   ========================= proteinhmdb =========================   ############################ 

# proteinhmdb<- data.frame(fread("Eisner/pathwaydatabase_uri//mbrole2_proteinhmdb.tsv"))
# proteinhmdbpath_chebi = proteinhmdb$Submitted.IDs
# names(proteinhmdbpath_chebi) = proteinhmdb$ID.Annotation
# listproteinhmdbpath = lapply(proteinhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listproteinhmdbpath_uniq = listproteinhmdbpath[which(!duplicated(listproteinhmdbpath))]
# 
# ## map cheiID to metabolites identifies
# proteinhmdbpath <- lapply(listproteinhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_uri$meta_ID[which(chebi_uri %in% x )] )
# 
# proteinhmdbpath_metab = proteinhmdbpath[which(!duplicated(proteinhmdbpath))]
# 
# save(proteinhmdbpath_metab, file="Eisner/pathwaydatabase_uri//proteinhmdbpath_metab_uri.RData")
# 
# 
# ############################   ========================= biofunction =========================   ############################ 
# 
# biofunction<- data.frame(fread("Eisner/pathwaydatabase_uri//mbrole2_biofunction.tsv"))
# biofunctionpath_chebi = biofunction$Submitted.IDs
# names(biofunctionpath_chebi) = biofunction$ID.Annotation
# listbiofunctionpath = lapply(biofunctionpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listbiofunctionpath_uniq = listbiofunctionpath[which(!duplicated(listbiofunctionpath))]
# 
# ## map cheiID to metabolites identifies
# biofunctionpath <- lapply(listbiofunctionpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_uri$meta_ID[which(chebi_uri %in% x )] )
# 
# biofunctionpath_metab = biofunctionpath[which(!duplicated(biofunctionpath))]
# 
# save(biofunctionpath_metab, file="Eisner/pathwaydatabase_uri//biofunctionpath_metab_uri.RData")


